Dicaeum gene tree coalescent heights investigation

0.1 load libraries

library(ape)
#install.packages("phangorn")
library(phangorn)

0.2 Info on function inputs

Here we will be using a single function to perform the gene tree topology sorting and coalescent height retrieval. The exact details of the required inputs for this function are explained below:

#################
#required inputs#
#################

#use 'sl_aln' to specify a set of single locus fasta alignments read into the working directory as a list of objects each of class 'DNAbin'. Example code for how to create an appropriate input for this variable using a for loop around the ape function 'read.dna()' is shown below in the section called "Bring in alignments and create the 'sl_aln' variable".

#use 'popmap' to specify a dataframe that gives each sample ID and its corresponding population for the alignments specified as 'sl_aln'.
#Must be a dataframe with two columns, first a character vector named 'id', and second a character vector named 'pops' with levels = c('P1','P2','P3','OG')
#e.g.,
#id          | pop
#---------------------
#D_hypo_123  | P1
#D_hypo_158  | P2
#D_hypo_199  | P3
#D_ni_555    | OG

#use 'quartet.samps' to designate the number of random quartets you want sampled (must be an integer)
#e.g., quartet.samps=1

#use 'model' to designate the sequence evolution model you want to use (options are 'raw', 'F84', or 'JC69'):
#e.g., model="raw"

#use 'treebuilder' to designate the approach you want to use for building gene trees (options are 'nj' or 'upgma')
#e.g., tree.builder="nj"

#################
#optional inputs#
#################

#use 'out.table' to specify the full output path and filename where you would like the results table saved (must be .txt extension)
#e.g., out.table="~/Desktop/coal.heights.txt"

0.3 Bring in alignments and create the ‘sl_aln’ variable

‘sl_aln’ is a list of single-locus fasta alignments. I use the code below to read them into R working memory and store them in a single sequential list object.

#Designate the directory where each of your single-locus fasta alignments are stored (must end with /)
sl_aln_dir<-"~/Desktop/phil.dicaeum/astral/fastas/"
#create list of single-locus alignment file names
sl_listfile<-list.files(path = sl_aln_dir)
#open an empty list to store each alignment
sl_aln<-list()
#use a loop to read in each fasta alignment and store them all in the list named 'sl_aln'
for (i in 1:length(sl_listfile)){
  #Read the alignments into local memory
  sl_aln[[i]]<-read.dna(file = paste(sl_aln_dir, sl_listfile[i], sep = ""), format = "fasta")
}

0.4 Bring in sample info and define ‘popmap’ variable

‘popmap’ is a dataframe mapping each sample in the alignment to one of 4 pre-designated species tips (details below). I read in and clean that dataframe using the following code:

#read in sampling sheet
popmap<-read.csv("~/Desktop/phil.dicaeum/dicaeum.retained.sampling.csv")[,c(1,5)]
#get exact names in the fasta alignment
fas<-names(read.FASTA("~/Desktop/phil.dicaeum/astral/fastas/locus.1.fasta"))
#add those to the sampling sheet and check that the order matches
popmap$id<-fas
#rename taxa to match the needed levels ('P1','P2','P3','OG')
popmap$pop<-gsub("Zamboanga","P1",gsub("Mindanao","P2",gsub("Luzon","P3",gsub("nigrilore","OG",popmap$Taxa))))
#subset out the relevant columns
popmap<-popmap[,c(3,4)]
#check that this looks as expected
head(popmap)
        id pop
1 hyp18070  P3
2 hyp18191  P1
3 hyp14037  P2
4 hyp14079  P2
5 hyp14065  P2
6 hyp14120  P2

0.5 Define a function to calculate topology frequencies and associated coalescent heights

The exact details of the required inputs for this function are explained above

#################
#define function#
#################

#define a function that will calculate your gene tree topology frequencies (excluding unresolved rooted 3 tip trees)
calc.heights.dummy<-function(sl_aln, popmap, quartet.samps, model, tree.builder, out.table=NULL){
#open dataframe to hold all results
top.dist<-data.frame()
#begin the random sampling loop
for (k in 1:quartet.samps){
  #randomly downsample all individuals present in the alignment to a single tip from each of the four taxa
  #P1<-sample(popmap$id[popmap$pop == "P1"], size = 1)
  #P2<-sample(popmap$id[popmap$pop == "P2"], size = 1)
  #P3<-sample(popmap$id[popmap$pop == "P3"], size = 1)
  #P_out<-sample(popmap$id[popmap$pop == "OG"], size = 1)
  P1<-popmap$id[2]
  P2<-popmap$id[3]
  P3<-popmap$id[1]
  P_out<-popmap$id[55]
  
  #open empty vectors to hold key variables for the given sampling scheme
  trees<-c()
  topology<-c()
  sister.dist<-c()
  aln.num<-c()
  
    #begin the fasta alignment loop
    for (i in 1:length(sl_aln)){
      #Prune the given alignment to contain only the sequence of interest (i) and the four randomly selected tips
      sl_aln_pruned<-sl_aln[[i]][c(P1, P2, P3, P_out),]
      
      #convert this alignment to the proper class
      dist_mat<-dist.dna(sl_aln_pruned, as.matrix = TRUE, model = model)
      
      #Get the neighbor joining topology of tree
      if(tree.builder == "nj"){tree<-nj(dist_mat)}
      else if(tree.builder == "upgma"){tree<-upgma(dist_mat)}

      #Root the tree
      root_tree<-root.phylo(tree, outgroup = grep(P_out,tree$tip.label), resolve.root = TRUE)
      
      #record the tree topology
      trees[i]<-write.tree(root_tree)
      
        #Get the topology of the full nj tree and record the pairwise distance between the two taxa recovered as sister (a proxy for coalescent height)
        if(root_tree$edge.length[root_tree$edge[,2] > Ntip(root_tree)][2] == 0){
          topology[i]<-"unresolved"
          sister.dist[i]<-NA
        }
        else if(is.monophyletic(tree, tips = c(1, 2))){
          topology[i]<-"12top"
          sister.dist[i]<-dist_mat[1,2]
        }
        else if(is.monophyletic(tree, tips = c(2, 3))){
          topology[i]<-"23top"
          sister.dist[i]<-dist_mat[2,3]
        }
        else if(is.monophyletic(tree, tips = c(1, 3))){
          topology[i]<-"13top"
          sister.dist[i]<-dist_mat[1,3]
        }
      
      #record the alignment number
      aln.num[i]<-i
      
    } #end the loop that iterates over each input fasta alignment

  #combine the results of the gene tree loop into a dataframe that stores all the information plus records the random sampling iteration
  top.dist<-rbind(top.dist,
                data.frame(rand.samp=rep(k, times=length(aln.num)),
                           alignment=aln.num,
                           topology=topology,
                           tree=trees,
                           sister.dist=sister.dist))

} #end random sampling loop

#
print((table(top.dist$topology)))

#plot
#coal height of sister relationship in gene trees matching the 35% topology
hist(top.dist$sister.dist[top.dist$topology == "12top"], breaks=20, main="pairwise dist P1-P2 in (P1,P2) gene trees",
     xlab = paste(model, tree.builder, "pairwise dist. mean =",round(mean(top.dist$sister.dist[top.dist$topology == "12top"], na.rm = T), 4)))
#coal height of sister relationship in gene trees matching the 34% topology
hist(top.dist$sister.dist[top.dist$topology == "23top"], breaks=20, main="pairwise dist P2-P3 in (P2,P3) gene trees",
     xlab = paste(model, tree.builder, "pairwise dist. mean =",round(mean(top.dist$sister.dist[top.dist$topology == "23top"], na.rm = T), 4)))
#coal height of sister relationship in gene trees matching the 31% topology
hist(top.dist$sister.dist[top.dist$topology == "13top"], breaks=20, main="pairwise dist P1-P3 in (P1,P3) gene trees",
     xlab = paste(model, tree.builder, "pairwise dist. mean =",round(mean(top.dist$sister.dist[top.dist$topology == "13top"], na.rm = T), 4)))

return(top.dist)

#write this info to disk if requested by user
if(!is.null(out.table)){write.table(top.dist, file = out.table, sep = "\t", quote = F, row.names = F)}

} #close function

0.6 run the function specifying “raw” divergence and “neighbor-joining” tree building

This function is specified as ‘dummy’ because it is not actually doing the random sampling, it is just picking the same individuals each time, so that we can determine whether the models chosen are affecting our inferences on the output

raw.nj<-calc.heights.dummy(sl_aln, popmap, quartet.samps=1, model="raw", tree.builder="nj")

     12top      13top      23top unresolved 
       173         46         70       2301 

0.7 “F84” divergence and “neighbor-joining” tree building

F84.nj<-calc.heights.dummy(sl_aln, popmap, quartet.samps=1, model="F84", tree.builder="nj")

     12top      13top      23top unresolved 
       235        132         87       2136 

0.8 “raw” divergence and “UPGMA” tree building

raw.upgma<-calc.heights.dummy(sl_aln, popmap, quartet.samps=1, model="raw", tree.builder="upgma", out.table)

     12top      13top      23top unresolved 
       141         99        185       2135 

0.9 “F84” divergence and “UPGMA” tree building

F84.upgma<-calc.heights.dummy(sl_aln, popmap, quartet.samps=1, model="F84", tree.builder="upgma", out.table)

     12top      13top      23top unresolved 
       128         98        194       2134 

redefine the dummy function using a sample from mt busa and see if that changes things

#################
#define function#
#################

#define a function that will calculate your gene tree topology frequencies (excluding unresolved rooted 3 tip trees)
calc.heights.dummy<-function(sl_aln, popmap, quartet.samps, model, tree.builder, out.table=NULL){
#open dataframe to hold all results
top.dist<-data.frame()
#begin the random sampling loop
for (k in 1:quartet.samps){
  #randomly downsample all individuals present in the alignment to a single tip from each of the four taxa
  #P1<-sample(popmap$id[popmap$pop == "P1"], size = 1)
  #P2<-sample(popmap$id[popmap$pop == "P2"], size = 1)
  #P3<-sample(popmap$id[popmap$pop == "P3"], size = 1)
  #P_out<-sample(popmap$id[popmap$pop == "OG"], size = 1)
  P1<-popmap$id[2]
  P2<-popmap$id[23]
  P3<-popmap$id[1]
  P_out<-popmap$id[55]
  
  #open empty vectors to hold key variables for the given sampling scheme
  trees<-c()
  topology<-c()
  sister.dist<-c()
  aln.num<-c()
  
    #begin the fasta alignment loop
    for (i in 1:length(sl_aln)){
      #Prune the given alignment to contain only the sequence of interest (i) and the four randomly selected tips
      sl_aln_pruned<-sl_aln[[i]][c(P1, P2, P3, P_out),]
      
      #convert this alignment to the proper class
      dist_mat<-dist.dna(sl_aln_pruned, as.matrix = TRUE, model = model)
      
      #Get the neighbor joining topology of tree
      if(tree.builder == "nj"){tree<-nj(dist_mat)}
      else if(tree.builder == "upgma"){tree<-upgma(dist_mat)}

      #Root the tree
      root_tree<-root.phylo(tree, outgroup = grep(P_out,tree$tip.label), resolve.root = TRUE)
      
      #record the tree topology
      trees[i]<-write.tree(root_tree)
      
        #Get the topology of the full nj tree and record the pairwise distance between the two taxa recovered as sister (a proxy for coalescent height)
        if(root_tree$edge.length[root_tree$edge[,2] > Ntip(root_tree)][2] == 0){
          topology[i]<-"unresolved"
          sister.dist[i]<-NA
        }
        else if(is.monophyletic(tree, tips = c(1, 2))){
          topology[i]<-"12top"
          sister.dist[i]<-dist_mat[1,2]
        }
        else if(is.monophyletic(tree, tips = c(2, 3))){
          topology[i]<-"23top"
          sister.dist[i]<-dist_mat[2,3]
        }
        else if(is.monophyletic(tree, tips = c(1, 3))){
          topology[i]<-"13top"
          sister.dist[i]<-dist_mat[1,3]
        }
      
      #record the alignment number
      aln.num[i]<-i
      
    } #end the loop that iterates over each input fasta alignment

  #combine the results of the gene tree loop into a dataframe that stores all the information plus records the random sampling iteration
  top.dist<-rbind(top.dist,
                data.frame(rand.samp=rep(k, times=length(aln.num)),
                           alignment=aln.num,
                           topology=topology,
                           tree=trees,
                           sister.dist=sister.dist))

} #end random sampling loop

#
print((table(top.dist$topology)))

#plot
#coal height of sister relationship in gene trees matching the 35% topology
hist(top.dist$sister.dist[top.dist$topology == "12top"], breaks=20, main="pairwise dist P1-P2 in (P1,P2) gene trees",
     xlab = paste(model, tree.builder, "pairwise dist. mean =",round(mean(top.dist$sister.dist[top.dist$topology == "12top"], na.rm = T), 4)))
#coal height of sister relationship in gene trees matching the 34% topology
hist(top.dist$sister.dist[top.dist$topology == "23top"], breaks=20, main="pairwise dist P2-P3 in (P2,P3) gene trees",
     xlab = paste(model, tree.builder, "pairwise dist. mean =",round(mean(top.dist$sister.dist[top.dist$topology == "23top"], na.rm = T), 4)))
#coal height of sister relationship in gene trees matching the 31% topology
hist(top.dist$sister.dist[top.dist$topology == "13top"], breaks=20, main="pairwise dist P1-P3 in (P1,P3) gene trees",
     xlab = paste(model, tree.builder, "pairwise dist. mean =",round(mean(top.dist$sister.dist[top.dist$topology == "13top"], na.rm = T), 4)))

return(top.dist)

#write this info to disk if requested by user
if(!is.null(out.table)){write.table(top.dist, file = out.table, sep = "\t", quote = F, row.names = F)}

} #close function

0.10 run the function specifying “raw” divergence and “neighbor-joining” tree building

This function is specified as ‘dummy’ because it is not actually doing the random sampling, it is just picking the same individuals each time, so that we can determine whether the models chosen are affecting our inferences on the output

raw.nj<-calc.heights.dummy(sl_aln, popmap, quartet.samps=1, model="raw", tree.builder="nj")

     12top      13top      23top unresolved 
       169         31         64       2326 

0.11 “F84” divergence and “neighbor-joining” tree building

F84.nj<-calc.heights.dummy(sl_aln, popmap, quartet.samps=1, model="F84", tree.builder="nj")

     12top      13top      23top unresolved 
       220        126         73       2171 

0.12 “raw” divergence and “UPGMA” tree building

raw.upgma<-calc.heights.dummy(sl_aln, popmap, quartet.samps=1, model="raw", tree.builder="upgma", out.table)

     12top      13top      23top unresolved 
       160         74        151       2173 

0.13 “F84” divergence and “UPGMA” tree building

F84.upgma<-calc.heights.dummy(sl_aln, popmap, quartet.samps=1, model="F84", tree.builder="upgma", out.table)

     12top      13top      23top unresolved 
       150         79        155       2170 

define the real function allowing for random quartet sampling

#################
#define function#
#################

#define a function that will calculate your gene tree topology frequencies (excluding unresolved rooted 3 tip trees)
calc.heights<-function(sl_aln, popmap, quartet.samps, model, tree.builder, out.table=NULL){
#open dataframe to hold all results
top.dist<-data.frame()
#begin the random sampling loop
for (k in 1:quartet.samps){
  #randomly downsample all individuals present in the alignment to a single tip from each of the four taxa
  P1<-sample(popmap$id[popmap$pop == "P1"], size = 1)
  P2<-sample(popmap$id[popmap$pop == "P2"], size = 1)
  P3<-sample(popmap$id[popmap$pop == "P3"], size = 1)
  P_out<-sample(popmap$id[popmap$pop == "OG"], size = 1)

  #open empty vectors to hold key variables for the given sampling scheme
  trees<-c()
  topology<-c()
  sister.dist<-c()
  aln.num<-c()
  
    #begin the fasta alignment loop
    for (i in 1:length(sl_aln)){
      #Prune the given alignment to contain only the sequence of interest (i) and the four randomly selected tips
      sl_aln_pruned<-sl_aln[[i]][c(P1, P2, P3, P_out),]
      
      #convert this alignment to the proper class
      dist_mat<-dist.dna(sl_aln_pruned, as.matrix = TRUE, model = model)
      
      #Get the neighbor joining topology of tree
      if(tree.builder == "nj"){tree<-nj(dist_mat)}
      else if(tree.builder == "upgma"){tree<-upgma(dist_mat)}

      #Root the tree
      root_tree<-root.phylo(tree, outgroup = grep(P_out,tree$tip.label), resolve.root = TRUE)
      
      #record the tree topology
      trees[i]<-write.tree(root_tree)
      
        #Get the topology of the full nj tree and record the pairwise distance between the two taxa recovered as sister (a proxy for coalescent height)
        if(root_tree$edge.length[root_tree$edge[,2] > Ntip(root_tree)][2] == 0){
          topology[i]<-"unresolved"
          sister.dist[i]<-NA
        }
        else if(is.monophyletic(tree, tips = c(1, 2))){
          topology[i]<-"12top"
          sister.dist[i]<-dist_mat[1,2]
        }
        else if(is.monophyletic(tree, tips = c(2, 3))){
          topology[i]<-"23top"
          sister.dist[i]<-dist_mat[2,3]
        }
        else if(is.monophyletic(tree, tips = c(1, 3))){
          topology[i]<-"13top"
          sister.dist[i]<-dist_mat[1,3]
        }
      
      #record the alignment number
      aln.num[i]<-i
      
    } #end the loop that iterates over each input fasta alignment

  #combine the results of the gene tree loop into a dataframe that stores all the information plus records the random sampling iteration
  top.dist<-rbind(top.dist,
                data.frame(rand.samp=rep(k, times=length(aln.num)),
                           alignment=aln.num,
                           topology=topology,
                           tree=trees,
                           sister.dist=sister.dist))

} #end random sampling loop

#
print((table(top.dist$topology)))

#plot
#coal height of sister relationship in gene trees matching the 35% topology
hist(top.dist$sister.dist[top.dist$topology == "12top"], breaks=20, main="pairwise dist P1-P2 in (P1,P2) gene trees",
     xlab = paste(model, tree.builder, "pairwise dist. mean =",round(mean(top.dist$sister.dist[top.dist$topology == "12top"], na.rm = T), 4)))
#coal height of sister relationship in gene trees matching the 34% topology
hist(top.dist$sister.dist[top.dist$topology == "23top"], breaks=20, main="pairwise dist P2-P3 in (P2,P3) gene trees",
     xlab = paste(model, tree.builder, "pairwise dist. mean =",round(mean(top.dist$sister.dist[top.dist$topology == "23top"], na.rm = T), 4)))
#coal height of sister relationship in gene trees matching the 31% topology
hist(top.dist$sister.dist[top.dist$topology == "13top"], breaks=20, main="pairwise dist P1-P3 in (P1,P3) gene trees",
     xlab = paste(model, tree.builder, "pairwise dist. mean =",round(mean(top.dist$sister.dist[top.dist$topology == "13top"], na.rm = T), 4)))

return(top.dist)

#write this info to disk if requested by user
if(!is.null(out.table)){write.table(top.dist, file = out.table, sep = "\t", quote = F, row.names = F)}

} #close function

0.14 run the function specifying “raw” divergence and “neighbor-joining” tree building

each time sample 10 random quartets and give the combined output across all iterations

raw.nj<-calc.heights(sl_aln, popmap, quartet.samps=10, model="raw", tree.builder="nj")

     12top      13top      23top unresolved 
      1414        448        645      23393 

0.15 “F84” divergence and “neighbor-joining” tree building

F84.nj<-calc.heights(sl_aln, popmap, quartet.samps=10, model="F84", tree.builder="nj")

     12top      13top      23top unresolved 
      1970       1160        833      21937 

0.16 “raw” divergence and “UPGMA” tree building

raw.upgma<-calc.heights(sl_aln, popmap, quartet.samps=10, model="raw", tree.builder="upgma", out.table)

     12top      13top      23top unresolved 
      1228        876       1460      22069 

0.17 “F84” divergence and “UPGMA” tree building

F84.upgma<-calc.heights(sl_aln, popmap, quartet.samps=10, model="F84", tree.builder="upgma", out.table)

     12top      13top      23top unresolved 
      1164        842       1567      22043 

0.18 Plot the distribution of coal heights under a ‘raw NJ’ framework across 100 random quartet samples

#run 100 random samples
raw.nj<-calc.heights(sl_aln, popmap, quartet.samps=100, model="raw", tree.builder="nj")

     12top      13top      23top unresolved 
     13263       4394       6291     235052 

#extract mean of each rep for each topology and plot histograms
dist12<-c()
dist23<-c()
dist13<-c()
for (i in 1:length(levels(as.factor(raw.nj$rand.samp)))){
  #isolate this replicate
  dist12[i]<-mean(raw.nj$sister.dist[raw.nj$rand.samp == i & raw.nj$topology == "12top"])
  dist23[i]<-mean(raw.nj$sister.dist[raw.nj$rand.samp == i & raw.nj$topology == "23top"])
  dist13[i]<-mean(raw.nj$sister.dist[raw.nj$rand.samp == i & raw.nj$topology == "13top"])
}

df<-data.frame(replicate=c(1:length(levels(as.factor(raw.nj$rand.samp)))),
               dist12.means=dist12,
               dist23.means=dist23,
               dist13.means=dist13)

c1 <- rgb(173,216,230,max = 255, alpha = 80, names = "lt.blue")
c2 <- rgb(255,192,203, max = 255, alpha = 80, names = "lt.pink")
c3 <- rgb(0,0,0, max = 255, alpha = 80, names = "lt.pink")

hist(df$dist12.means, breaks = seq(0,0.02,.001), col=c1, xlim = c(0,0.02), xaxp=c(0,0.02,5), xlab = "mean raw dist", main = "sister topology coalescent heights\nblue=(P1,P2), pink=(P2,P3), gray=(P1,P3)")
abline(v=mean(df$dist12.means), lty="dashed")

hist(df$dist23.means, breaks = seq(0,0.02,.001), col=c2, xlim = c(0,0.02), xaxp=c(0,0.02,5), add=TRUE)
abline(v=mean(df$dist23.means), lty="dashed")

hist(df$dist13.means, breaks = seq(0,0.02,.001), col=c3, xlim = c(0,0.02), xaxp=c(0,0.02,5), add=TRUE)
abline(v=mean(df$dist13.means), lty="dashed")

hist(df$dist12.means, breaks = seq(0,0.02,.001), col=c1, xlim = c(0,0.02), xaxp=c(0,0.02,5), xlab = "mean raw dist", main = "sister topology coalescent heights\nblue=(P1,P2), pink=(P2,P3), gray=(P1,P3)")
abline(v=mean(df$dist12.means), lty="dashed")

hist(df$dist23.means, breaks = seq(0,0.02,.001), col=c2, xlim = c(0,0.02), xaxp=c(0,0.02,5), add=TRUE)
abline(v=mean(df$dist23.means), lty="dashed")